********************************************************************************
* Gender Gaps in Formal Employment: Evidence from Bangladesh
* Analysis Code for BBS Labour Force Survey 2024
* 
* Author: Tanvir Thamid
* Date: January 2026
********************************************************************************

clear all
set more off
capture log close

* Set paths - MODIFY THESE FOR YOUR SYSTEM
* NOTE: Check your actual folder names. The original code used globals defined elsewhere.
* Common setups:
*   global datapath_emp "/Users/tanvirthamid/Desktop/Labor/LFS_2024/Employment Education"
*   global datapath_emp "/Users/tanvirthamid/Desktop/Labor/LFS_2024/Data/Employment"
* Adjust to match your directory structure.

global datapath_emp "/Users/tanvirthamid/Desktop/Labor/LFS_2024/Employment Education"
global datapath_se  "/Users/tanvirthamid/Desktop/Labor/LFS_2024/Socio Economic"
global output       "/Users/tanvirthamid/Desktop/Labor/LFS_2024/Output"

log using "$output/analysis_log.txt", text replace


********************************************************************************
* SECTION 1: APPEND AND MERGE DATA
********************************************************************************

*-------------------------------------------------------------------------------
* 1.1 Append Employment-Education Data (Q1-Q4)
*-------------------------------------------------------------------------------

use "$datapath_emp/Q1_Employment_Education 24.dta", clear
gen quarter = 1

tempfile q1
save `q1'

use "$datapath_emp/Q2_Employment_Education 24.dta", clear
gen quarter = 2

tempfile q2
save `q2'

use "$datapath_emp/Q3_Employment_Education 24.dta", clear
gen quarter = 3

tempfile q3
save `q3'

use "$datapath_emp/Q4_Employment _Education 24.dta", clear
gen quarter = 4

* Append all quarters
append using `q1'
append using `q2'
append using `q3'

* Create unique household ID for merging
gen hhid = string(PSU) + "_" + string(EAUM) + "_" + string(HHNO)

* Save appended employment data
save "$output/employment_all.dta", replace

di "Employment data appended: `=_N' observations"


*-------------------------------------------------------------------------------
* 1.2 Append Socio-Economic Data (Q1-Q4)
*-------------------------------------------------------------------------------

use "$datapath_se/BBS_LFS_Q1_2024_Socio_Economic.dta", clear
gen quarter = 1
tempfile se1
save `se1'

use "$datapath_se/BBS_LFS_Q2_2024_Socio_Economic.dta", clear
gen quarter = 2
tempfile se2
save `se2'

use "$datapath_se/BBS_LFS_Q3_2024_Socio_Economic.dta", clear
gen quarter = 3
tempfile se3
save `se3'

use "$datapath_se/BBS_LFS_Q4_2024_Socio_Economic.dta", clear
gen quarter = 4

append using `se1'
append using `se2'
append using `se3'

* Create unique household ID
gen hhid = string(PSU) + "_" + string(EAUM) + "_" + string(HHNO)

* Keep household-level vars for wealth index
keep hhid quarter HI3 HI4 HI5 HI7 HI8 HI9 HI10 HI11 HI12 ///
     HI13A HI13B HI13C HI13D HI13E HI13F HI13G HI13H HI13I ///
     HI13J HI13K HI13L HI13M HI13N HI13O

* Remove duplicates - keep one obs per household per quarter
duplicates drop hhid quarter, force

save "$output/socioeco_all.dta", replace

di "Socio-economic data appended"


*-------------------------------------------------------------------------------
* 1.3 Merge Employment and Socio-Economic Data
*-------------------------------------------------------------------------------

use "$output/employment_all.dta", clear

merge m:1 hhid quarter using "$output/socioeco_all.dta"

tab _merge
drop _merge

save "$output/merged_data.dta", replace

di "Data merged: `=_N' observations"


********************************************************************************
* SECTION 2: VARIABLE CONSTRUCTION
********************************************************************************

use "$output/merged_data.dta", clear

*-------------------------------------------------------------------------------
* 2.1 Core Variables
*-------------------------------------------------------------------------------

* Sex variable (0=male, 1=female)
* EMP_HR03: 1=MALE, 2=FEMALE, 3=TRANSGENDER
gen female = .
replace female = 0 if EMP_HR03 == 1
replace female = 1 if EMP_HR03 == 2
label define female_lb 0 "Male" 1 "Female"
label values female female_lb

* Age 
gen age = EMP_HR04

* Employment status - keep only employed
* BBS_lfs13: 1=Employed, 2=Unemployed, 3=Outside Labour Force
gen employed = (BBS_lfs13 == 1)
keep if employed == 1
drop employed

di "Sample restricted to employed: `=_N'"

* Formal employment (outcome variable)
* BBS_job1_ife_nature: 1=Informal, 2=Formal
gen formal = .
replace formal = 1 if BBS_job1_ife_nature == 2
replace formal = 0 if BBS_job1_ife_nature == 1
label define formal_lb 0 "Informal" 1 "Formal"
label values formal formal_lb

* Drop missing formality
drop if formal == .
di "Sample with non-missing formality: `=_N'"


*-------------------------------------------------------------------------------
* 2.2 Education Categories
*-------------------------------------------------------------------------------

* L_education: 0=None, 1=primary, 2=secondary, 3=Higher secondary, 4=Tertiary, 5=Others
gen educ = .
replace educ = 0 if L_education == 0
replace educ = 1 if L_education == 1
replace educ = 2 if L_education == 2
replace educ = 3 if L_education == 3
replace educ = 4 if L_education == 4
replace educ = . if L_education == 5

label define educ_lb 0 "None" 1 "Primary" 2 "Secondary" 3 "Higher Secondary" 4 "Tertiary"
label values educ educ_lb


*-------------------------------------------------------------------------------
* 2.3 Occupation (ISCO-08 1-digit)
*-------------------------------------------------------------------------------

* Extract 1-digit occupation code from the detailed code
gen occ_code = floor(BBS_job1_ocu_isco08_2digits / 1000)

* Professional occupation dummy (ISCO 1-3: Managers, Professionals, Technicians)
gen professional_occ = (occ_code >= 1 & occ_code <= 3)
replace professional_occ = . if occ_code == .

label define profocc_lb 0 "Non-professional" 1 "Professional (ISCO 1-3)"
label values professional_occ profocc_lb


*-------------------------------------------------------------------------------
* 2.4 Industry (ISIC Rev 4 - 2-digit)
*-------------------------------------------------------------------------------

* Extract 2-digit industry code
gen ind_code = floor(BBS_job1_eco_isic4_2digits / 1000)

* Industry groupings
gen industry_group = .
replace industry_group = 1 if ind_code >= 1 & ind_code <= 3   // Agriculture, forestry, fishing
replace industry_group = 2 if ind_code >= 5 & ind_code <= 9   // Mining
replace industry_group = 3 if ind_code >= 10 & ind_code <= 33 // Manufacturing
replace industry_group = 4 if ind_code >= 35 & ind_code <= 39 // Utilities
replace industry_group = 5 if ind_code >= 41 & ind_code <= 43 // Construction
replace industry_group = 6 if ind_code >= 45 & ind_code <= 47 // Trade
replace industry_group = 7 if ind_code >= 49 & ind_code <= 53 // Transport
replace industry_group = 8 if ind_code >= 55 & ind_code <= 56 // Accommodation/food
replace industry_group = 9 if ind_code >= 58 & ind_code <= 63 // Information/communication
replace industry_group = 10 if ind_code >= 64 & ind_code <= 66 // Finance
replace industry_group = 11 if ind_code == 68 // Real estate
replace industry_group = 12 if ind_code >= 69 & ind_code <= 75 // Professional services
replace industry_group = 13 if ind_code >= 77 & ind_code <= 82 // Admin/support
replace industry_group = 14 if ind_code == 84 // Public administration
replace industry_group = 15 if ind_code == 85 // Education
replace industry_group = 16 if ind_code >= 86 & ind_code <= 88 // Health/social work
replace industry_group = 17 if ind_code >= 90 & ind_code <= 93 // Arts/recreation
replace industry_group = 18 if ind_code >= 94 & ind_code <= 96 // Other services
replace industry_group = 19 if ind_code == 97 // Domestic work
replace industry_group = 20 if industry_group == . & ind_code != . // Other

label define indgrp_lb 1 "Agriculture" 2 "Mining" 3 "Manufacturing" 4 "Utilities" ///
    5 "Construction" 6 "Trade" 7 "Transport" 8 "Accommodation/Food" ///
    9 "ICT" 10 "Finance" 11 "Real Estate" 12 "Professional Services" ///
    13 "Admin/Support" 14 "Public Admin" 15 "Education" 16 "Health" ///
    17 "Arts/Recreation" 18 "Other Services" 19 "Domestic Work" 20 "Other"
label values industry_group indgrp_lb

* Agriculture dummy
gen in_agriculture = (industry_group == 1)


*-------------------------------------------------------------------------------
* 2.5 Track A / Track B Definition
*-------------------------------------------------------------------------------

* Track A: Professional/Technical/Managerial (ISCO 1-3) in Education, Health, or Public Admin
gen track_a = (professional_occ == 1 & inlist(industry_group, 14, 15, 16))
replace track_a = . if professional_occ == . | industry_group == .

label define track_lb 0 "Track B" 1 "Track A"
label values track_a track_lb


*-------------------------------------------------------------------------------
* 2.6 Wealth Index Construction (PCA)
*-------------------------------------------------------------------------------

* Create binary asset indicators
* Wall materials (HI3): 1=Brick/cement, 2=CI sheet, 3=Wood, 4=Mud/bamboo, 5=Other
gen wall_pucca = (HI3 == 1 | HI3 == 5)  // Brick/cement or other permanent

* Roof materials (HI4): 1=Straw, 2=CI sheet, 3=Tiles, 4=Concrete
gen roof_pucca = (HI4 == 4)  // Concrete roof

* Floor materials (HI5): 1=Earth/mud, 2=Wood, 3=Cement/mosaic, 4=Tiles
gen floor_pucca = (HI5 == 3 | HI5 == 4)  // Cement/mosaic or tiles

* Water source (HI7): 1=Tubewell, 2=Tap/piped, 3=Well, 4=Pond, 5=River, 6=Other
gen water_improved = (HI7 == 1 | HI7 == 2)  // Tubewell or tap

* Electricity (HI8): 1=Electricity, 2=Solar, 3=Kerosene, 4=Other
gen has_electricity = (HI8 == 1 | HI8 == 2)

* Cooking fuel (HI9): 1=Wood/straw, 2=Gas, 3=Electricity, 4=Kerosene, 5=Coal, 6=Other
gen cooking_clean = (HI9 == 2 | HI9 == 3)  // Gas or electricity

* Sanitation (HI10): 1=Sanitary, 2=Pit latrine, 3=No facility, 4=Other
gen toilet_improved = (HI10 == 1)  // Sanitary toilet

* Land ownership (HI12): 0=None, 1=<50 decimals, 2=50-249, 3=250-749, 4=750+
gen land_owned = HI12 if HI12 != 99

* Asset ownership from HI13 series
gen has_tv = (HI13A == "A")
gen has_fridge = (HI13B == "B")
gen has_ac = (HI13C == "C")
gen has_fan = (HI13D == "D")
gen has_computer = (HI13E == "E")
gen has_bicycle = (HI13F == "F")
gen has_motorcycle = (HI13G == "G")
gen has_car = (HI13H == "H")
gen has_mobile = (HI13K == "K")
gen has_internet = (HI13N == "N")

* Run PCA on asset variables
pca wall_pucca roof_pucca floor_pucca water_improved has_electricity ///
    cooking_clean toilet_improved land_owned ///
    has_tv has_fridge has_ac has_fan has_computer ///
    has_bicycle has_motorcycle has_car has_mobile has_internet, components(1)

* Get first principal component as wealth score
predict wealth_score, score

* CRITICAL: Check PCA loadings direction
* Higher values of assets should = higher wealth score
* If floor_pucca or toilet_improved have NEGATIVE loadings, flip the score
di _newline "Checking PCA loadings..."
matrix loadings = e(L)
local floor_loading = loadings[3,1]
local toilet_loading = loadings[7,1]

di "Floor pucca loading: `floor_loading'"
di "Toilet improved loading: `toilet_loading'"

* If key wealth indicators have negative loadings, invert the score
if `floor_loading' < 0 | `toilet_loading' < 0 {
    di "WARNING: Inverting wealth score (negative loadings detected)"
    replace wealth_score = -1 * wealth_score
}

* Create wealth quintiles
xtile wealth_quintile = wealth_score, nq(5)
label define wq_lb 1 "Q1 (Poorest)" 2 "Q2" 3 "Q3" 4 "Q4" 5 "Q5 (Richest)"
label values wealth_quintile wq_lb


*-------------------------------------------------------------------------------
* 2.7 Earnings Quintiles (within gender)
*-------------------------------------------------------------------------------

* Use MJ_15A for total monthly earnings (more comprehensive than WT_02)
* MJ_15A captures earnings from main job for all workers, not just wage employees
gen earnings = MJ_15A if MJ_15A > 0 & MJ_15A < .

* Create within-gender earnings quintiles
gen earn_quintile = .
xtile eq_male = earnings if earnings > 0 & female == 0, nq(5)
xtile eq_female = earnings if earnings > 0 & female == 1, nq(5)
replace earn_quintile = eq_male if female == 0
replace earn_quintile = eq_female if female == 1
drop eq_male eq_female

label define eq_lb 1 "Q1 (Lowest)" 2 "Q2" 3 "Q3" 4 "Q4" 5 "Q5 (Highest)"
label values earn_quintile eq_lb

di "Workers with earnings data: " 
count if earnings > 0 & earnings < .


*-------------------------------------------------------------------------------
* 2.8 Rural/Urban
*-------------------------------------------------------------------------------

* RU: 1=Urban, 2=Rural, 3=City Corporation
gen urban = (RU == 1 | RU == 3)
label define urban_lb 0 "Rural" 1 "Urban"
label values urban urban_lb


*-------------------------------------------------------------------------------
* 2.9 Sector
*-------------------------------------------------------------------------------

* BBS_job1_ins_sector: 1=Public, 2=Private
gen public_sector = (BBS_job1_ins_sector == 1)
label define public_lb 0 "Private" 1 "Public"
label values public_sector public_lb


* Save analysis dataset
save "$output/analysis_data.dta", replace

di "Analysis dataset saved with `=_N' observations"


********************************************************************************
* SECTION 3: DESCRIPTIVE STATISTICS (TABLE 1)
********************************************************************************

use "$output/analysis_data.dta", clear

* Table 1: Descriptive Statistics by Gender

* Overall sample
di "=== SAMPLE COMPOSITION ==="
tab female

* Formal employment rate by gender
di _newline "=== FORMAL EMPLOYMENT BY GENDER ==="
tab female formal, row

* Mean formality by gender
sum formal if female == 0
local male_formal = r(mean) * 100
sum formal if female == 1
local female_formal = r(mean) * 100
local raw_gap = `male_formal' - `female_formal'

di _newline "Male formality rate: " %5.1f `male_formal' "%"
di "Female formality rate: " %5.1f `female_formal' "%"
di "Raw gender gap: " %5.1f `raw_gap' " percentage points"

* Education distribution
di _newline "=== EDUCATION DISTRIBUTION ==="
tab educ female, col nofreq

* Industry distribution
di _newline "=== AGRICULTURE SHARE ==="
sum in_agriculture if female == 0
di "Male in agriculture: " %5.1f r(mean)*100 "%"
sum in_agriculture if female == 1
di "Female in agriculture: " %5.1f r(mean)*100 "%"

* Track A/B distribution
di _newline "=== TRACK A SHARE ==="
sum track_a if female == 0
di "Male in Track A: " %5.1f r(mean)*100 "%"
sum track_a if female == 1
di "Female in Track A: " %5.1f r(mean)*100 "%"


********************************************************************************
* SECTION 4: TWO-TRACK ANALYSIS (FIGURE 1)
********************************************************************************

di _newline "========================================"
di "TWO-TRACK ANALYSIS"
di "========================================"

* Track A formality by gender
di _newline "Track A (Professional/Public Sector):"
sum formal if track_a == 1 & female == 0
local track_a_male = r(mean) * 100
sum formal if track_a == 1 & female == 1
local track_a_female = r(mean) * 100
local track_a_gap = `track_a_male' - `track_a_female'

di "  Male: " %5.1f `track_a_male' "%"
di "  Female: " %5.1f `track_a_female' "%"
di "  Gap: " %5.1f `track_a_gap' " pp"

* Track B formality by gender
di _newline "Track B (All Other):"
sum formal if track_a == 0 & female == 0
local track_b_male = r(mean) * 100
sum formal if track_a == 0 & female == 1
local track_b_female = r(mean) * 100
local track_b_gap = `track_b_male' - `track_b_female'

di "  Male: " %5.1f `track_b_male' "%"
di "  Female: " %5.1f `track_b_female' "%"
di "  Gap: " %5.1f `track_b_gap' " pp"

* Export data for Figure 1
preserve
    drop if female == . | track_a == .
    collapse (mean) formal, by(track_a female)
    replace formal = formal * 100
    
    reshape wide formal, i(track_a) j(female)
    rename formal0 male_rate
    rename formal1 female_rate
    gen gap = male_rate - female_rate
    
    list
    export delimited using "$output/figure1_twotrack.csv", replace
restore

* Figure 1: Two-Track Formality
preserve
    drop if female == . | track_a == .
    collapse (mean) formal, by(track_a female)
    replace formal = formal * 100
    
    graph bar formal, over(female) over(track_a) ///
        ytitle("Formal Employment Rate (%)") ///
        title("Figure 1: Formality Rates by Labor Market Track") ///
        subtitle("Track A: Professional/Public Sector; Track B: All Other") ///
        legend(order(1 "Male" 2 "Female")) ///
        bar(1, color(navy)) bar(2, color(maroon)) ///
        blabel(bar, format(%4.1f))
    
    graph export "$output/figure1_twotrack.png", replace width(1200)
restore


********************************************************************************
* SECTION 5: OAXACA-BLINDER DECOMPOSITION (TABLE 2)
********************************************************************************

di _newline "========================================"
di "OAXACA-BLINDER DECOMPOSITION"
di "========================================"

* Create dummy variables for decomposition
tab educ, gen(educ_d)
tab occ_code, gen(occ_d)
tab industry_group, gen(ind_d)

* Mark sample for decomposition (non-missing on all vars)
gen sample_ob = !missing(formal, female, educ, occ_code, industry_group)
tab sample_ob

* Run Oaxaca decomposition
oaxaca formal educ_d* occ_d* ind_d* if sample_ob == 1, by(female) weight(1) relax

* Summary
di _newline "========================================"
di "DECOMPOSITION SUMMARY"
di "========================================"

qui sum formal if female == 0 & sample_ob == 1
local male_mean = r(mean)
qui sum formal if female == 1 & sample_ob == 1
local female_mean = r(mean)
local raw_gap = `male_mean' - `female_mean'

di "Male formality rate: " %5.3f `male_mean'
di "Female formality rate: " %5.3f `female_mean'
di "Raw gap: " %5.3f `raw_gap'


********************************************************************************
* SECTION 6: WEALTH ANALYSIS (FIGURE 3)
********************************************************************************

di _newline "========================================"
di "WEALTH QUINTILE ANALYSIS"
di "========================================"

forvalues q = 1/5 {
    di _newline "Quintile `q':"
    qui sum formal if wealth_quintile == `q' & female == 0
    local male_q`q' = r(mean) * 100
    qui sum formal if wealth_quintile == `q' & female == 1
    local female_q`q' = r(mean) * 100
    local gap_q`q' = `male_q`q'' - `female_q`q''
    di "  Male: " %5.1f `male_q`q'' "%"
    di "  Female: " %5.1f `female_q`q'' "%"
    di "  Gap: " %5.1f `gap_q`q'' " pp"
}

* Export data for Figure 3
preserve
    drop if female == . | wealth_quintile == .
    collapse (mean) formal_rate = formal, by(wealth_quintile female)
    replace formal_rate = formal_rate * 100
    
    reshape wide formal_rate, i(wealth_quintile) j(female)
    rename formal_rate0 male_rate
    rename formal_rate1 female_rate
    gen gap = male_rate - female_rate
    
    list
    export delimited using "$output/figure3_wealth.csv", replace
restore

* Figure 3: Wealth Quintiles
preserve
    drop if female == . | wealth_quintile == .
    collapse (mean) formal, by(wealth_quintile female)
    replace formal = formal * 100
    
    reshape wide formal, i(wealth_quintile) j(female)
    rename formal0 male
    rename formal1 female
    
    twoway (connected male wealth_quintile, msymbol(O) mcolor(navy) lcolor(navy)) ///
           (connected female wealth_quintile, msymbol(S) mcolor(maroon) lcolor(maroon)), ///
        ytitle("Formal Employment Rate (%)") ///
        xtitle("Wealth Quintile") ///
        xlabel(1 "Q1 (Poorest)" 2 "Q2" 3 "Q3" 4 "Q4" 5 "Q5 (Richest)") ///
        title("Figure 3: Formal Employment by Wealth Quintile") ///
        legend(order(1 "Male" 2 "Female"))
    
    graph export "$output/figure3_wealth.png", replace width(1200)
restore


********************************************************************************
* SECTION 7: EARNINGS QUINTILE ANALYSIS (FIGURE 4)
********************************************************************************

di _newline "========================================"
di "EARNINGS QUINTILE ANALYSIS"
di "========================================"

forvalues q = 1/5 {
    di _newline "Earnings Quintile `q':"
    qui sum formal if earn_quintile == `q' & female == 0
    if r(N) > 0 {
        local male_e`q' = r(mean) * 100
    }
    else {
        local male_e`q' = .
    }
    qui sum formal if earn_quintile == `q' & female == 1
    if r(N) > 0 {
        local female_e`q' = r(mean) * 100
    }
    else {
        local female_e`q' = .
    }
    local gap_e`q' = `male_e`q'' - `female_e`q''
    di "  Male: " %5.1f `male_e`q'' "%"
    di "  Female: " %5.1f `female_e`q'' "%"
    di "  Gap: " %5.1f `gap_e`q'' " pp"
}

* Export data for Figure 4
preserve
    drop if female == . | earn_quintile == .
    collapse (mean) formal_rate = formal, by(earn_quintile female)
    replace formal_rate = formal_rate * 100
    
    reshape wide formal_rate, i(earn_quintile) j(female)
    rename formal_rate0 male_rate
    rename formal_rate1 female_rate
    gen gap = male_rate - female_rate
    
    list
    export delimited using "$output/figure4_earnings.csv", replace
restore

* Figure 4: Earnings Quintiles
preserve
    drop if female == . | earn_quintile == .
    collapse (mean) formal, by(earn_quintile female)
    replace formal = formal * 100
    
    reshape wide formal, i(earn_quintile) j(female)
    rename formal0 male
    rename formal1 female
    
    twoway (connected male earn_quintile, msymbol(O) mcolor(navy) lcolor(navy)) ///
           (connected female earn_quintile, msymbol(S) mcolor(maroon) lcolor(maroon)), ///
        ytitle("Formal Employment Rate (%)") ///
        xtitle("Within-Gender Earnings Quintile") ///
        xlabel(1 "Q1 (Lowest)" 2 "Q2" 3 "Q3" 4 "Q4" 5 "Q5 (Highest)") ///
        title("Figure 4: Formal Employment by Earnings Quintile") ///
        subtitle("Within-gender quintiles") ///
        legend(order(1 "Male" 2 "Female"))
    
    graph export "$output/figure4_earnings.png", replace width(1200)
restore


********************************************************************************
* SECTION 8: EDUCATION ANALYSIS (FIGURE 5)
********************************************************************************

di _newline "========================================"
di "EDUCATION LEVEL ANALYSIS"
di "========================================"

forvalues e = 0/4 {
    local edlab : label educ_lb `e'
    di _newline "`edlab':"
    qui sum formal if educ == `e' & female == 0
    local male_ed`e' = r(mean) * 100
    qui sum formal if educ == `e' & female == 1
    local female_ed`e' = r(mean) * 100
    local gap_ed`e' = `male_ed`e'' - `female_ed`e''
    di "  Male: " %5.1f `male_ed`e'' "%"
    di "  Female: " %5.1f `female_ed`e'' "%"
    di "  Gap: " %5.1f `gap_ed`e'' " pp"
}

* Export data for Figure 5
preserve
    drop if female == . | educ == .
    collapse (mean) formal_rate = formal, by(educ female)
    replace formal_rate = formal_rate * 100
    
    reshape wide formal_rate, i(educ) j(female)
    rename formal_rate0 male_rate
    rename formal_rate1 female_rate
    gen gap = male_rate - female_rate
    
    list
    export delimited using "$output/figure5_education.csv", replace
restore

* Figure 5: Education
preserve
    drop if female == . | educ == .
    collapse (mean) formal, by(educ female)
    replace formal = formal * 100
    
    reshape wide formal, i(educ) j(female)
    rename formal0 male
    rename formal1 female
    
    twoway (connected male educ, msymbol(O) mcolor(navy) lcolor(navy)) ///
           (connected female educ, msymbol(S) mcolor(maroon) lcolor(maroon)), ///
        ytitle("Formal Employment Rate (%)") ///
        xtitle("Education Level") ///
        xlabel(0 "None" 1 "Primary" 2 "Secondary" 3 "Higher Sec" 4 "Tertiary") ///
        title("Figure 5: Formal Employment by Education Level") ///
        legend(order(1 "Male" 2 "Female"))
    
    graph export "$output/figure5_education.png", replace width(1200)
restore


********************************************************************************
* SECTION 9: REGRESSION ANALYSIS
********************************************************************************

* Basic gender gap
reg formal i.female, robust

* Adding education
reg formal i.female i.educ, robust

* Adding occupation
reg formal i.female i.educ i.occ_code, robust

* Adding industry
reg formal i.female i.educ i.occ_code i.industry_group, robust

* Full model with wealth and location
reg formal i.female i.educ i.occ_code i.industry_group i.wealth_quintile i.urban i.quarter, robust


********************************************************************************
* SECTION 10: SUMMARY STATISTICS FOR PAPER
********************************************************************************

di _newline "========================================"
di "SUMMARY STATISTICS FOR PAPER"
di "========================================"

* Total sample
count
di "Total employed sample: " r(N)

* By gender
count if female == 0
di "Male: " r(N)
count if female == 1
di "Female: " r(N)

* Overall formality rates
sum formal if female == 0
di "Male formality rate: " %5.1f r(mean)*100 "%"
sum formal if female == 1
di "Female formality rate: " %5.1f r(mean)*100 "%"

* Agriculture share
sum in_agriculture if female == 0
di "Male agriculture share: " %5.1f r(mean)*100 "%"
sum in_agriculture if female == 1
di "Female agriculture share: " %5.1f r(mean)*100 "%"

* Professional occupation share
sum professional_occ if female == 0
di "Male professional share: " %5.1f r(mean)*100 "%"
sum professional_occ if female == 1
di "Female professional share: " %5.1f r(mean)*100 "%"


********************************************************************************
* CLEANUP
********************************************************************************

log close

di _newline "Analysis complete. Output saved to: $output"

